{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Analysis of multivariate data\n",
    "\n",
    "- Regression line\n",
    "- Correlation\n",
    "\n",
    "Author:  Thomas Haslwanter, Date:    Feb-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "from numpy.linalg import lstsq\n",
    "from urllib.request import urlopen\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "## Regression Line\n",
    "\n",
    "Fit a line, using the powerful \"ordinary least square\" method of pandas.\n",
    "\n",
    "*Data from 24 type 1 diabetic patients, relating Fasting blood glucose (mmol/l) to mean circumferential shortening velocity (%/sec), derived form echocardiography.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Get the data\n",
    "url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/'\n",
    "inFile = 'altman_11_6.txt'\n",
    "url = url_base + inFile\n",
    "data = np.genfromtxt(urlopen(url), delimiter=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Solve equations \"by hand\" ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([ 1.098,  0.022]), array([ 0.986]), 2, array([ 54.079,   1.838]))\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEYVJREFUeJzt3XuMXOV9xvHnMevslsTc4jWpudhRFZwEEkwywbTQBkMV\nDFR2IrWo2E0KJUIJSUpQVCANDYrCH23oJbFQAIu4Tmtw1CY4SUmJCLQNlgqGNVcDAUeFLOaSXWq1\n5iJvWfnXP2YcFnt3Z3bm7LznvPP9SCvv7BzN+Wk859l33/M773FECACQlzmpCwAAFI9wB4AMEe4A\nkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSoL9WO58+fH4sXL061ewCopG3btr0UEYPNtksW\n7osXL9bQ0FCq3QNAJdn+RSvbMS0DABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0ACjCye4/Ou/Eejby8\nJ3Upkgh3ACjE2rt26P5ndmntnTtSlyIpYZ87AORgyVW3a2x8768eb9w6rI1bh9XfN0dPXnN2sroY\nuQNAB7Zcvlwrly7UwNx6nA7MnaNVSxdqyxXLk9ZFuANABxYcMqB5/X0aG9+r/r45Ghvfq3n9fVow\nbyBpXUzLAECHXnplTGuWLdLqk4/VLfcNa7QEJ1UdEUl2XKvVgrVlAGBmbG+LiFqz7ZiWAYAMEe4A\nkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0AMkS4A0CGCHcAyBDhDgAZ\nItwBIEOEOwBkiHAHgAwR7gCQIcIdADJEuANAhgh3AMgQ4Q4AGSLcASBDhDsAZIhwB4AMNQ132+tt\nj9jePsXzh9r+F9sP237M9oXFlwkAmIlWRu4bJK2Y5vnPSHo8Ik6UdLqkv7H9ls5LAwC0q2m4R8Td\nknZNt4mkebYt6W2NbceLKQ8A0I4i5tyvk/QeSc9LelTSpRGxd7INbV9se8j20OjoaAG7BgBMpohw\nP0vSQ5IWSloq6Trbh0y2YUSsi4haRNQGBwcL2DUAYDJFhPuFkm6Nup9LelrSuwt4XQBAm4oI92FJ\nZ0qS7SMlLZH0XwW8LgCgTX3NNrC9SfUumPm2d0q6WtJcSYqIGyR9VdIG249KsqQrIuKlWasYANBU\n03CPiPObPP+8pI8UVhEAoGNcoQpgWiO79+i8G+/RyMt7UpeCGSDcAUxr7V07dP8zu7T2zh2pS8EM\nNJ2WAdCbllx1u8bG37hkZePWYW3cOqz+vjl68pqzE1aGVjByBzCpLZcv18qlCzUwtx4TA3PnaNXS\nhdpyxfLElaEVhDuASS04ZEDz+vs0Nr5X/X1zNDa+V/P6+7Rg3kDq0tACpmUATOmlV8a0ZtkirT75\nWN1y37BGOalaGY6IJDuu1WoxNDSUZN8AUFW2t0VErdl2TMsAQIYIdwDIEOEOABki3AEgQ4Q7AGSI\ncAeADBHumBUsNgWkRbhjVrDYFJAWV6iiUCw2BZQDI3cUisWmgHIg3FEoFpsCyoFpGRSOxaaA9Fg4\nDAAqhIXDAKCHEe4AkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0AMkS4\nA0CGCHcAyBDhDgAZahruttfbHrG9fZptTrf9kO3HbP+02BIBADPVysh9g6QVUz1p+zBJ35S0MiKO\nl/QHxZQGAGhX03CPiLsl7Zpmk9WSbo2I4cb2IwXVBgBoUxFz7sdJOtz2f9jeZvsTU21o+2LbQ7aH\nRkdHC9g1AGAyRYR7n6QPSjpX0lmS/sL2cZNtGBHrIqIWEbXBwcECdg0AmEwRN8jeKem/I+JVSa/a\nvlvSiZKeKuC1AQBtKGLk/gNJp9nus32wpGWSnijgdQEAbWo6cre9SdLpkubb3inpaklzJSkiboiI\nJ2z/WNIjkvZKuikipmybBADMvqbhHhHnt7DNtZKuLaQiAEDHuEIVADJEuANAhgh3AMgQ4Q4AGSLc\nASBDhDsAZIhwB4AMEe4AkCHCHQAyRLgnMrJ7j8678R6NvLwndSkAMkS4J7L2rh26/5ldWnvnjtSl\nAMhQEUv+YgaWXHW7xsb3/urxxq3D2rh1WP19c/TkNWcnrAxAThi5d9mWy5dr5dKFGphbf+sH5s7R\nqqULteWK5YkrA5ATwr3LFhwyoHn9fRob36v+vjkaG9+ref19WjBvIHVpADLCtEwCL70ypjXLFmn1\nycfqlvuGNcpJVQAFc0Qk2XGtVouhoaEk+57MyO49+uymB3Xd6pMYRQMoLdvbIqLWbDumZRroXgGQ\nk56flqF7BUCOen7kTvcKgBz1fLjTvQKgm7p1dXrPh7v0RvfK5ktO1ZplizT6yljqkgBkqlvn9+iW\nAYAu2P/83j4zPb9HtwwAlEi3z+8R7gDQBd0+v9fzrZAA0C3dvDqdOXcAqBDm3AGghxHuAJAhwh0A\nMkS4A0CGCHcAyBDhDgAZItwBIEOEOwBkqGm4215ve8T29ibbfcj2uO3fL648AEA7Whm5b5C0YroN\nbB8k6a8k3VFATQCADjUN94i4W9KuJpt9TtL3JI0UURQAoDMdz7nbPkrSxyRd38K2F9sesj00Ojra\n6a4BAFMo4oTq1yVdEREHrkK/n4hYFxG1iKgNDg4WsGsAwGSKWPK3Juk7tiVpvqRzbI9HxPcLeG0A\nQBs6DveIeOe+721vkHQbwQ4AabXSCrlJ0j2Sltjeafsi25+y/anZLw+9qlt3iAdy1XTkHhHnt/pi\nEXFBR9UADRPvEH/Nx96XuhygcrjNHkpl/zvEb9w6rI1bh2d8h3ig17H8AEql23eIB3JFuKNUun2H\neCBXTMugdLp5h3ggV46IJDuu1WoxNDSUZN/Ix8juPfrspgd13eqTGN2jJ9jeFhG1ZtsxLYNKm9hV\nA+ANTMugkuiqAabHyB2VRFcNMD3CHZVEVw0wPcIdlbWvq2bzJadqzbJFGn1lLHVJbWGpBcwGumWA\nxK7a/Khuvm9Ya04+lqUW0FSr3TKcUEUhaEmcOU4KYzYxLYNC0JI4c5wUxmxi5I6OMPpsHyeFMZsY\nuaMjjD47k8tJYZQPI3d0hNFnZ278+Bvnxa756AkJK0FuCHd0jIW+gPKhFRLAm9D5VG4sHAagLXQ+\n5YFwL7mirl7kKkg0s+Sq27X4yh9p49ZhRdQ7nxZf+SMtuer21KWhDYR7yRU1imI0hmbofMoLJ1RL\nqqj+cfrQ0So6n/LCyL2kihpFMRrDTNB3nw9G7iVV1CiK0Rhmgr77fDByT6SVE5xFjaIYjQG9hz73\nRFjmFUA7WPK3pDjBCaAbmJbpMk5wAugGwr3LOMEJoBuYlkmAhbYAzDZOqAIzwKJaSI2Fw4BZwDIO\nqAqmZYAW0OWEqmHkDrSALidUTdNwt73e9ojt7VM8v8b2I7Yftf2ftk8svkwgLbqcUDWtjNw3SFox\nzfNPS/pwRLxP0lclrSugLqB0WMYBVdJSt4ztxZJui4hpVxKyfbik7RFxVLPXpFsGAGYuVbfMRZKm\nvG2L7YttD9keGh0dLXjX7eEORSgCnyOUTWHhbnu56uF+xVTbRMS6iKhFRG1wcLCoXXeE1jYUgc8R\nyqaQaRnb75e0WdLZEfFUKztOPS2zf2vbPlVpbeNimnKo+ucI1dO1aRnbx0q6VdLHWw32Mqh6axsj\nxXKo+ucI+Wp6EZPtTZJOlzTf9k5JV0uaK0kRcYOkL0t6u6Rv2pak8VZ+q6RW1dY2LqYpl6p+jpC/\npuEeEec3ef6Tkj5ZWEVdVMUFvLZcvlzX/OsTuuOxF7Xn9b0amDtHZx3/Dn3p3PekLq1nVfFzhPz1\n9PIDVbxfJCPF8qni5wj56+lwrypGigCaYclfAKgQlvwFgB5GuANAhgh3AMgQ4Q4AGSLcASBDhDsA\nZKhy4c7Sqt3F+w1UU+XCnQWzuov3G6imylzExNKq3cX7DZRTdhcxsbRqd/F+A9VWmXBnwazu4v0G\nqq1SC4exYFZ38X4D1VWZOXdgJrgNIXKV3Zw7MBN0+aDXVWpaBmiG2xACdYzckRW6fIA6wh1ZocsH\nqGNaBtmhywegWwYAKoVuGQDoYYQ7AGSIcAcwq1g2Og3CHcCs4oKyNOiWATAruKAsLUbuAGYFF5Sl\n1TPhzrwf0F1cUJZWz4Q7835A9+27oGzzJadqzbJFGn1lLHVJPSP7i5i4XRyAnHARUwPzfgB6Ufbh\nzrwfgF7UE62QLCQFoNc0nXO3vV7S70kaiYgTJnnekr4h6RxJr0m6ICIeaLZjFg4DgJkrcs59g6QV\n0zx/tqR3Nb4ulnR9KwUCAGZP03CPiLsl7Zpmk1WS/iHq7pV0mO1fL6pAAMDMFXFC9ShJz054vLPx\nswPYvtj2kO2h0dHRAnYNAJhMV7tlImJdRNQiojY4ONjNXQNATyki3J+TdMyEx0c3fgYASKSIcP+h\npE+47hRJ/xsRLxTwugCANrXSCrlJ0umS5kv6paSrJc2VpIi4odEKeZ3qHTWvSbowIpr2ONoelfSL\nFuucL+mlFrdNgfo6Q33tK3NtEvV1arL6FkVE03ntZGvLzITtoVb6OlOhvs5QX/vKXJtEfZ3qpL7s\nlx8AgF5EuANAhqoS7utSF9AE9XWG+tpX5tok6utU2/VVYs4dADAzVRm5AwBmoBLhbvsg2w/avi11\nLfuzfZjt79r+me0nbP9m6pomsn2Z7cdsb7e9yXbShextr7c9Ynv7hJ8dYfsntnc0/j28RLVd2/i/\nfcT2ZtuHpahtqvomPPcF22F7foraGjVMWp/tzzXew8dsf61M9dleavte2w81lkY5OVFtx9j+d9uP\nN96nSxs/b/vYqES4S7pU0hOpi5jCNyT9OCLeLelElahO20dJ+lNJtcZyzQdJ+sO0VU26yuiVku6K\niHdJuqvxOIUNOrC2n0g6ISLeL+kpSV/sdlETbNAkK7TaPkbSRyQNd7ug/WzQfvXZXq764oInRsTx\nkv46QV37bNCB79/XJH0lIpZK+nLjcQrjkr4QEe+VdIqkz9h+rzo4Nkof7raPlnSupJtS17I/24dK\n+h1J35KkiPi/iPiftFUdoE/Sr9nuk3SwpOdTFjPFKqOrJH278f23JX20q0U1TFZbRNwREeONh/eq\nvrxGEtOs0Pp3ki6XlPQE2hT1fVrSX0bEWGObka4X1jBFfSHpkMb3hyrR8RERL+y7D0ZEvKz6IPEo\ndXBslD7cJX1d9Q/ugXe5Tu+dkkYl/X1j2ugm229NXdQ+EfGc6iOlYUkvqL40xB1pq5rUkROWrHhR\n0pEpi5nGn0i6PXURE9leJem5iHg4dS1TOE7Sb9veavuntj+UuqD9fF7StbafVf1YSfmXmSTJ9mJJ\nJ0naqg6OjVKHu+19d4DalrqWKfRJ+oCk6yPiJEmvKt2UwgEa83OrVP8ltFDSW23/Udqqphf19q3S\ntXDZ/pLqfzrfnLqWfWwfLOnPVZ9OKKs+SUeoPtXwZ5L+qbFkSVl8WtJlEXGMpMvU+Cs8Fdtvk/Q9\nSZ+PiN0Tn5vpsVHqcJd0qqSVtp+R9B1JZ9jemLakN9kpaWdEbG08/q7qYV8Wvyvp6YgYjYjXJd0q\n6bcS1zSZX+67wUvj32R/uk/G9gWq32pyTZSrd/g3VP/F/XDjGDla0gO235G0qjfbKenWxs187lP9\nL/BkJ30n8ceqHxeS9M+SkpxQlSTbc1UP9psjYl9NbR8bpQ73iPhiRBwdEYtVPxH4bxFRmpFnRLwo\n6VnbSxo/OlPS4wlL2t+wpFNsH9wYLZ2pEp3wneCHqh9kavz7g4S1vIntFapPC66MiNdS1zNRRDwa\nEQsiYnHjGNkp6QONz2VZfF/SckmyfZykt6hcC3U9L+nDje/PkLQjRRGN4/Nbkp6IiL+d8FT7x0ZE\nVOJL9ZUpb0tdxyR1LZU0JOkR1T/Ih6euab/6viLpZ5K2S/pHSf2J69mk+vz/66qH0UWS3q56J8AO\nSXdKOqJEtf1c9TuNPdT4uqFM791+zz8jaX6Z6lM9zDc2Pn8PSDqjZPWdJmmbpIdVn+P+YKLaTlN9\nyuWRCZ+1czo5NrhCFQAyVOppGQBAewh3AMgQ4Q4AGSLcASBDhDsAZIhwB4AMEe4AkCHCHQAy9P+e\nb3ujZYMyEQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x176b1253550>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# First I have to delete rows containing \"nan\"\n",
    "a,b = np.where(np.isnan(data))\n",
    "data = np.delete(data, a, axis=0)\n",
    "\n",
    "x,y = data[:,0], data[:,1]\n",
    "plt.plot(x,y,'*')\n",
    "\n",
    "# Create the design matrix\n",
    "Xmat = sm.add_constant(x)\n",
    "\n",
    "# Calculate the parameters\n",
    "params = lstsq(Xmat, y)\n",
    "np.set_printoptions(precision=3)\n",
    "print(params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### ... then solve them with *pandas* and *statsmodels*\n",
    "\n",
    "pandas handles \"nan\" gracefully, and also provides more information about the fit. So let's use pandas, and compare the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 Results: Ordinary least squares\n",
      "=================================================================\n",
      "Model:              OLS              Adj. R-squared:     0.134   \n",
      "Dependent Variable: Vcf              AIC:                -3.1672 \n",
      "Date:               2017-02-04 15:13 BIC:                -0.8962 \n",
      "No. Observations:   23               Log-Likelihood:     3.5836  \n",
      "Df Model:           1                F-statistic:        4.414   \n",
      "Df Residuals:       21               Prob (F-statistic): 0.0479  \n",
      "R-squared:          0.174            Scale:              0.046957\n",
      "-------------------------------------------------------------------\n",
      "              Coef.    Std.Err.     t      P>|t|    [0.025   0.975]\n",
      "-------------------------------------------------------------------\n",
      "Intercept     1.0978     0.1175   9.3446   0.0000   0.8535   1.3421\n",
      "glucose       0.0220     0.0105   2.1010   0.0479   0.0002   0.0437\n",
      "-----------------------------------------------------------------\n",
      "Omnibus:              1.717        Durbin-Watson:           1.802\n",
      "Prob(Omnibus):        0.424        Jarque-Bera (JB):        1.270\n",
      "Skew:                 0.562        Prob(JB):                0.530\n",
      "Kurtosis:             2.756        Condition No.:           29   \n",
      "=================================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import statsmodels.formula.api as smf\n",
    "\n",
    "model_fit = smf.ols('Vcf~glucose', df).fit()\n",
    "\n",
    "print(model_fit.summary2())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Correlation\n",
    "\n",
    "Pearson correlation, and two types of rank correlation (Spearman, Kendall)\n",
    "\n",
    "*Comparing age and percentage of body-fat (measured by dual-photon absorptiometry) for 18 normal adults.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Get the data\n",
    "inFile = 'altman_11_1.txt'\n",
    "url = url_base + inFile\n",
    "data = np.genfromtxt(urlopen(url), delimiter=',')\n",
    "\n",
    "x = data[:,0]\n",
    "y = data[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'pearson': 0.79208623217849128, 'spearman': 0.75387958553761558, 'kendall': 0.57620948508912284}\n"
     ]
    }
   ],
   "source": [
    "# Calculate correlations\n",
    "corr = {}\n",
    "corr['pearson'], _ = stats.pearsonr(x,y)\n",
    "corr['spearman'], _ = stats.spearmanr(x,y)\n",
    "corr['kendall'], _ = stats.kendalltau(x,y)\n",
    "\n",
    "print(corr)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spearman's rho = 0.754, and Pearson's r (rankordered) = 0.754\n"
     ]
    }
   ],
   "source": [
    "# Show that Spearman's rho is just the Pearson's R of the rank-ordered data\n",
    "r_rankordered = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]\n",
    "print(\"Spearman's rho = {0:5.3f}, and Pearson's r (rankordered) = {1:5.3f}\".format(corr['spearman'], r_rankordered))"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
